Section 1: Cases and Masks, Kate

Section 2: Cases and FB cmnty CLI, Vishnu



Section 1 Cases and Masks

Get the case and signal data and prepare it

Retain only those counties that:
1. have more than a certain number of cases (2000 total in this case)
2. have signal data for at least a certain number of days (50 in this case)

## Split CASES into per-county tables, named by FIPS code
geos_case <- cases %>%
  group_keys(geo_value) %>%
  unlist()
case_list <- cases %>%
  group_by(geo_value) %>%
  select(geo_value, time_value, value) %>%
  arrange(time_value) %>%
  group_split() %>%
  setNames(geos_case)

## Split SIGNAL into per-county tables, named by FIPS code
mask_list <- masks %>%
  group_by(geo_value) %>%
  select(geo_value, time_value, value) %>%
  arrange(time_value) %>%
  group_split()
# Each split table holds exactly one geo; pull that code out for naming
geos_masks <- unlist(lapply(mask_list, function(a) unique(a$geo_value)))
mask_list <- setNames(mask_list, geos_masks)

## Combine and sort by overlapping geos
case_list <- case_list[geos_masks]
# assert_that(all(names(case_list) == names(mask_list)))

## Retain only the geos averaging more than X cases a day
# X = sum / numdays = 2000 / 91 = ~22
case_totals <- vapply(case_list, function(a) sum(a$value), numeric(1))
large_geos <- names(case_list)[which(case_totals > 2000)]
case_list <- case_list[large_geos]
mask_list <- mask_list[large_geos]
## ...and only the geos with more than 50 days of signal data
mask_days <- vapply(mask_list, nrow, integer(1))
large_geos <- names(mask_list)[which(mask_days > 50)]
mask_list <- mask_list[large_geos]
case_list <- case_list[large_geos]
# assert_that(all(names(case_list) == names(mask_list)))

Smooth and take derivatives

  1. Scale the case and signal data using scale function.
  2. Smooth the data using the ksmooth function with a chosen bandwidth (12 in this case).
  3. Use Ryan’s estimate_deriv function (in this case using the ss param) to calculate local derivatives
  4. Use the same function to calculate second derivatives
  5. Mark “rise points” for cases (“fall points” for mask signal) by finding points where
  • the first derivative crosses a certain threshold (0.03 for cases and -0.007 for masks)
  • the second derivative is above (or below) a certain threshold (0.0007 in this case)
  • the first derivative is still above the threshold some days out

Plot

The plots with the red dots are cases, and the dots mark the beginning of a rise.
The plots with blue dots are the signal and, in this case, the dots mark the beginning of a decline in mask usage.
The plots come in pairs, with the case plot above the mask plot for each county.

Here are some examples:

# Tuning parameters: smoothing bandwidth and derivative thresholds
bw <- 12
deriv_1_threshold <- 0.03
deriv_2_threshold <- 0.0007
deriv_1_threshold_masks <- -0.007

# Copies that will hold the smoothed series, plus containers for the
# per-county derivative tables filled in the loop below
smooth_case_list <- case_list
smooth_mask_list <- mask_list
smooth_case_deriv_list <- vector(mode = "list", length = length(case_list))
smooth_mask_deriv_list <- vector(mode = "list", length = length(mask_list))

# Two rows (case over mask) by three columns of plots per page
par(mfcol = c(2, 3))
# par(mar=c(3,4,1,1))

# Flag "rise points" in a smoothed case-derivative table.
# A point qualifies when the first derivative has just crossed above
# deriv_1_threshold (the previous day was below it), the second derivative
# exceeds deriv_2_threshold, and the first derivative is still above the
# threshold 2, 4, 6, 8 and 10 days out.
get_rise_points = function(data) {
  fd = data$first_deriv
  checks = list(
    fd >= deriv_1_threshold,
    lag(fd) < deriv_1_threshold,
    data$second_deriv > deriv_2_threshold,
    lead(fd, 2) > deriv_1_threshold,
    lead(fd, 4) > deriv_1_threshold,
    lead(fd, 6) > deriv_1_threshold,
    lead(fd, 8) > deriv_1_threshold,
    lead(fd, 10) > deriv_1_threshold
  )
  # Right fold reproduces the original nested ifelse chain exactly,
  # including its NA propagation at the edges of the series
  Reduce(function(check, acc) ifelse(check, acc, FALSE), checks, TRUE, right = TRUE)
}

# Flag "fall points" in a smoothed mask-signal derivative table.
# A point qualifies when the first derivative has just crossed below
# deriv_1_threshold_masks (the previous day was above it), the second
# derivative is below deriv_2_threshold, and the first derivative stays
# below the threshold 2, 4, 6 and 8 days out.
get_fall_points = function(data) {
  fd = data$first_deriv
  checks = list(
    fd <= deriv_1_threshold_masks,
    lag(fd) > deriv_1_threshold_masks,
    data$second_deriv < deriv_2_threshold,
    lead(fd, 2) < deriv_1_threshold_masks,
    lead(fd, 4) < deriv_1_threshold_masks,
    lead(fd, 6) < deriv_1_threshold_masks,
    lead(fd, 8) < deriv_1_threshold_masks
  )
  # Right fold = the original nested ifelse chain, NA semantics included
  Reduce(function(check, acc) ifelse(check, acc, FALSE), checks, TRUE, right = TRUE)
}

# Per-county pipeline: scale and smooth each case/mask series, estimate first
# and second derivatives, then flag rise points (cases) and fall points
# (masks). Fills smooth_case_deriv_list / smooth_mask_deriv_list, and plots
# a sample of counties (indices 215-229) as case-over-mask pairs.
for(ii in (1:length(case_list))){
  # Smooth cases and signal (z-scale first so the derivative thresholds are
  # comparable across counties, then kernel-smooth with bandwidth bw)
  smooth_case_list[[ii]]$value = case_list[[ii]] %>% mutate(value=scale(value)) %>% pull(value) %>% sm(bw)
  smooth_mask_list[[ii]]$value = mask_list[[ii]]  %>% mutate(value=scale(value)) %>% pull(value) %>% sm(bw)
  
  # Create cols with first derivs for cases and signal via smoothing splines
  # (n = 28 -- presumably the trailing training window in days; TODO confirm
  # against modeltools::estimate_deriv)
  smooth_case_deriv <- modeltools::estimate_deriv(smooth_case_list[[ii]], method = "ss", col_name="first_deriv", n=28)
  smooth_mask_deriv <- modeltools::estimate_deriv(smooth_mask_list[[ii]], method = "ss", col_name = "first_deriv", n = 28)
  
  # Create cols with second derivs by re-running estimate_deriv on a copy
  # whose value column holds the first derivative
  # TODO can clean up
  smooth_case_deriv_2 <- smooth_case_deriv
  smooth_case_deriv_2$value = smooth_case_deriv$first_deriv
  smooth_case_deriv_2 = modeltools::estimate_deriv(smooth_case_deriv_2, method = "ss", col_name="second_deriv", n=28)
  smooth_case_deriv = smooth_case_deriv %>% add_column(second_deriv = NA)
  smooth_case_deriv$second_deriv = smooth_case_deriv_2$second_deriv
  
  # Same second-derivative construction for the mask signal
  smooth_mask_deriv_2 <- smooth_mask_deriv
  smooth_mask_deriv_2$value = smooth_mask_deriv$first_deriv
  smooth_mask_deriv_2 = modeltools::estimate_deriv(smooth_mask_deriv_2, method = "ss", col_name="second_deriv", n=28)
  smooth_mask_deriv = smooth_mask_deriv %>% add_column(second_deriv = NA)
  smooth_mask_deriv$second_deriv = smooth_mask_deriv_2$second_deriv
  
  # Mark rise points (cases) and fall points (masks)
  smooth_case_deriv = smooth_case_deriv %>% add_column(rise_point = FALSE)
  smooth_case_deriv$rise_point = get_rise_points(smooth_case_deriv)
  smooth_mask_deriv = smooth_mask_deriv %>% add_column(fall_point = FALSE)
  smooth_mask_deriv$fall_point = get_fall_points(smooth_mask_deriv)
  
  # Save data for the recall/precision step below
  smooth_case_deriv_list[[ii]] = smooth_case_deriv
  smooth_mask_deriv_list[[ii]] = smooth_mask_deriv
  
  # Plot only a small window of counties so the rendered output stays manageable
  if (ii >= 215 && ii < 230) {
    county = smooth_case_deriv[[1, 'geo_value']]
    smooth_case_deriv  %$% plot(x=time_value, y=value, type="l", main = county)
    smooth_case_deriv %>% filter(rise_point) %$% points(x=time_value, y=value, col="red", lwd=10)
  
    county = smooth_mask_deriv[[1, 'geo_value']]
    smooth_mask_deriv  %$% plot(x=time_value, y=value, type="l", main = county)
    smooth_mask_deriv %>% filter(fall_point) %$% points(x=time_value, y=value, col="blue", lwd=10)
  }
}

Calculate recall and precision

Now that we have our method for determining points of interest, let’s look at where the signal and case points of interest occur in relation to each other for each county where there is at least one point of interest for the case and the signal.

We measure recall as number of counties where there is at least one signal point of interest that falls some number of days or fewer before a case point of interest divided by the number of counties where there is at least one case point of interest. When we observed a case rise, how many times did a point of interest in our signal precede it?

We measure precision as number of counties where there is at least one signal point of interest that falls some number of days or fewer before a case point of interest divided by the number of counties where there is at least one signal point of interest. When we observed a point of interest in our signal, how many times did it precede a rise in cases?

In this case the number of days we look at is 21: if mask usage has an impact on case incidence, the effect probably takes a few weeks to appear. This number (along with all the other thresholding numbers in this notebook) is up for discussion.

Also up for discussion is how we measure success: are we looking at the county level, or at the “rise point” level, where we perform the recall and precision calculations for all points of interest, rather than for each county? What counts as success on the county level? At least one match between case and signal (as done here), or all points matched?

# Narrow down counties to those that have a rise point for both cases and signal,
# then count "successes": a mask fall point 1-21 days before a case rise point.
# NOTE(review): success_count increments once per matching (mask, case) date
# pair, not once per county, so it is not strictly the county-level count the
# surrounding text describes; the recall/precision ratios below inherit this.
success_count = 0
case_total_count = 0
mask_total_count = 0
for (i in (1:length(smooth_case_deriv_list))) {
  # Does this county have at least one case rise point / mask fall point?
  case_points = TRUE %in% smooth_case_deriv_list[[i]]$rise_point
  if (case_points) {case_total_count = case_total_count + 1}
  mask_points = TRUE %in% smooth_mask_deriv_list[[i]]$fall_point
  if(mask_points){mask_total_count = mask_total_count + 1}
  if(case_points && mask_points) {
    # Dates of flagged points (the point columns can hold NA at series edges)
    case_dates = na.omit(smooth_case_deriv_list[[i]]$time_value[smooth_case_deriv_list[[i]]$rise_point==TRUE])
    mask_dates = na.omit(smooth_mask_deriv_list[[i]]$time_value[smooth_mask_deriv_list[[i]]$fall_point==TRUE])
    # Check if any of the mask points are <= 21 days before any of the case points
    for (j in (1:length(mask_dates))) {
      for (k in (1:length(case_dates))) {
        if(case_dates[k] - mask_dates[j] <= 21 && case_dates[k] - mask_dates[j] > 0) {
          success_count = success_count + 1
        }
      }
    }
  }
}

Success count (‘mask drop’ <= 21 days before ‘case rise’):

success_count
## [1] 198

Total number of counties with at least one ‘case rise’:

case_total_count
## [1] 412

Total number of counties with at least one ‘mask drop’:

mask_total_count
## [1] 344

Recall: success count / counties with ‘case rise’:

success_count / case_total_count
## [1] 0.4805825

Precision: success count / counties with ‘mask drop’:

success_count / mask_total_count
## [1] 0.5755814

Appendix

Some salient examples. Here are some good examples that show where mask usage drops before cases rise:

par(mfcol = c(2,3))

# Plot the (cases, mask) panel pair for every county whose FIPS code appears
# in `examples`. Red dots mark case rise points; blue dots mark mask fall
# points. Reads the smooth_case_deriv_list / smooth_mask_deriv_list globals.
plot_examples = function(examples) {
  # BUG FIX: was `1:length(...)`, which iterates c(1, 0) on an empty list;
  # also hoists the geo_value lookup so it is done once per county
  for (i in seq_along(smooth_case_deriv_list)) {
    county = smooth_case_deriv_list[[i]]$geo_value[1]
    if (county %in% examples) {
      smooth_case_deriv_list[[i]] %$% plot(x=time_value, y=value, type="l", main = county)
      smooth_case_deriv_list[[i]] %>% filter(rise_point) %$% points(x=time_value, y=value, col="red", lwd=15)
      smooth_mask_deriv_list[[i]] %$% plot(x=time_value, y=value, type="l", main = county)
      smooth_mask_deriv_list[[i]] %>% filter(fall_point) %$% points(x=time_value, y=value, col="blue", lwd=15)
    }
  }
}

# Counties where a mask-usage drop visibly precedes a case rise
good_examples <- list("01101", "06059", "08059", "12015", "18057",
                      "37119", "39025", "41005", "42011", "45083")
plot_examples(good_examples)

Here are some bad examples that show counter examples to this trend, and also instances where the algorithm to find the rise/drop points showed some weakness:

par(mfcol = c(2, 3))

# Counter-examples, including cases where the rise/drop detector misfires
bad_examples <- list("12001", "12011", "21067", "28121", "29095",
                     "34003", "45063", "48201")
plot_examples(bad_examples)

Section 2

Leading Indicators DAP

Section 1: Approach 1 for selecting time periods

In this approach we identify peaks (30-day window), select for the preceding local minima (7-day window), then select the 14 days before this minimum. We will first examine if this approach selects for the intended periods.

# Libraries plus cached inputs: county-level case counts and the FB
# CLI-in-community survey signal, both saved from a previous pull
suppressMessages(library(covidcast))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
case_counts = readRDS(file="../case_counts.RDS")
cli=readRDS(file="../cli_signal.RDS")
# Flag local maxima of x: a point is a peak when it is the largest value in
# the window of width 2*halfWindowSize + 1 centred on it (ties resolved by
# max.col(..., "first") on the embedded window matrix). Returns a logical
# vector the same length as x; the first and last halfWindowSize entries are
# always FALSE because their windows are incomplete.
# Adapted from:
# https://stackoverflow.com/questions/10892803/calculate-peak-values-in-a-plot-using-r
peaks <- function(x, halfWindowSize) {
  width <- 2 * halfWindowSize + 1
  win_mat <- embed(x, width)
  # The window centre sits in column halfWindowSize + 1 of each embedded row
  centre_is_max <- max.col(win_mat, "first") == halfWindowSize + 1
  pad <- rep(FALSE, halfWindowSize)
  c(pad, centre_is_max, pad)
}
# Min-max normalisation: map x linearly onto [0, 1]
rescale_vector = function(x){
  lo = min(x)
  (x - lo) / (max(x) - lo)
}
  # Function to transform from one range to another
# Linearly map x from from_range = c(lo, hi) into to_range = c(lo, hi)
trans = function(x, from_range, to_range) {
  unit = (x - from_range[1]) / (from_range[2] - from_range[1])
  unit * (to_range[2] - to_range[1]) + to_range[1]
}

# Taken from a previous DAP (Ryan)
# Two contrasting colors, similar to the ggplot defaults
ggplot_colors = c("#00AFBB", "#FC4E07")

# Globals consumed by plot_one(): incidence (df_in) and FB CLI (df_fb) signals
df_in = case_counts
df_fb = cli
# Dual-axis comparison of the FB CLI-in-community signal and new COVID-19
# cases for one county.
#
# Args:
#   geo_value: county FIPS code to plot.
#   title: plot title; if NULL (default), "<county>, <state>" is derived
#     from the FIPS code. (BUG FIX: this argument used to be silently
#     overwritten even when supplied.)
#   xlab, ylab1, ylab2: axis labels (left axis = cases, right axis = % CLI).
#   legend: if TRUE, draw the legend at the bottom.
#
# Returns: a ggplot object.
plot_one = function(geo_value, title = NULL, xlab = NULL,
                    ylab1 = NULL, ylab2 = NULL, legend =  TRUE
                    ) {
  # Filter down the signal data frames
  given_geo_value = geo_value
  df_fb_one = df_fb %>% filter(geo_value == given_geo_value)
  df_in_one = df_in %>% filter(geo_value == given_geo_value)
  
  # Compute ranges of the two signals
  range1 = df_in_one %>% select("value") %>% range
  range2 = df_fb_one %>% select("value") %>% range
  
  # Convenience functions mapping between the two signal ranges
  trans12 = function(x) trans(x, range1, range2)
  trans21 = function(x) trans(x, range2, range1)
  
  # Derive a default title from the FIPS code unless the caller supplied one
  if (is.null(title)) {
    state_name = fips_to_name(paste0(substr(geo_value, 1, 2), "000"))
    state_abbr = name_to_abbr(state_name)
    title = paste0(fips_to_name(geo_value), ", ", state_abbr)
  }
  
  # Transform the FB signal into the incidence range, then stack the two
  # signals rowwise into one data frame
  df = select(rbind(df_fb_one %>% mutate_at("value", trans21),
                    df_in_one), c("time_value", "value"))
  df$signal = c(rep("% CLI-in-community", nrow(df_fb_one)),
                rep("New COVID-19 cases", nrow(df_in_one)))
  #df = df[which(df$time_value >= min_time_value & df$time_value <= max_time_value),]
  # Finally, plot both signals; the right-hand axis is back-transformed
  # via trans12. (BUG FIX: `legend.pos` relied on partial argument matching;
  # the real theme() argument is `legend.position`.)
  pos = ifelse(legend, "bottom", "none")
  return(ggplot(df, aes(x = time_value, y = value)) +
           geom_line(aes(color = signal), size = 1.5) +
           scale_color_manual(values = ggplot_colors[1:2]) +
           scale_y_continuous(name = ylab1, limits = range1,
                              sec.axis = sec_axis(trans = trans12,
                                                  name = ylab2)) +
           labs(title = title, x = xlab) + theme_bw() +
           theme(legend.position = pos, legend.title = element_blank(), 
                 axis.text = element_text(size = 12),
                 legend.text = element_text(size = 12),
                 axis.title = element_text(size = 12),
                 title = element_text(size = 13)))
}
# Additional libraries for the smoothing/derivative analysis below
suppressMessages(library(magrittr))
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
suppressMessages(library(pspline))
suppressMessages(library(lubridate))## Helper smoother function
# Kernel-smooth a numeric series with a Gaussian kernel, evaluated at every
# index 1..n; returns a vector the same length as the input.
# BUG FIX: x was `1:(n-1)`, one element shorter than y, so ksmooth silently
# paired x with only the first n-1 observations and the final data point
# never contributed to the fit.
sm <- function(y, bandwidth = 2){
  n = length(y)
  return(ksmooth(x = 1:n, y = y, bandwidth = bandwidth, x.points = 1:n, kernel="normal")$y)
}
# Split both cached signals into per-county tables keyed by FIPS code
caselist=split(data.table::as.data.table(case_counts), by = "geo_value")
fblist=split(data.table::as.data.table(cli), by = "geo_value")
# Example county indices: santa clara 234, bronx ny 1863, miami dade county 372
example_ind = 372
# Position of the same county within the FB list
fb_matching_ind = which(names(fblist) %in% names(caselist)[example_ind])
# Integer time axes used for the spline fits below
tt = 1:length(caselist[[example_ind]]$time_value)
fbtt = 1:length(fblist[[fb_matching_ind]]$time_value)
#myspline=splinefun(tt, sm(caselist[[2]]$value, 10))

# Scale each signal to [0, 1], smooth it, then differentiate the fitted
# interpolating spline at every time point (deriv = 1 and 2)
case_signal_normalized = rescale_vector(caselist[[example_ind]]$value)
fb_signal_normalized = rescale_vector(fblist[[fb_matching_ind]]$value)
case_first_deriv = splinefun(tt, sm(case_signal_normalized, 10))(tt, 1)
case_second_deriv = splinefun(tt, sm(case_signal_normalized, 10))(tt, 2)
fb_first_deriv = splinefun(fbtt, sm(fb_signal_normalized, 10))(fbtt, 1)
fb_second_deriv = splinefun(fbtt, sm(fb_signal_normalized, 10))(fbtt, 2)

# 4x2 panel: raw, scaled, smoothed, and first-derivative views of the case
# and FB CLI signals for the example county (cases red, FB cyan)
par(mfrow =c(4,2))
#Raw Signals Plotted
plot(caselist[[example_ind]]$time_value, caselist[[example_ind]]$value, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts"), col = 'red')
plot(fblist[[fb_matching_ind]]$time_value, fblist[[fb_matching_ind]]$value, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " FB Community Survey"), col = 'cyan')

#Scaled Signals (rescaled onto [0, 1])
plot(caselist[[example_ind]]$time_value, case_signal_normalized, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts (Scaled)"), col = 'red')
plot(fblist[[fb_matching_ind]]$time_value, fb_signal_normalized, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " FB Community Survey (Scaled)"), col = 'cyan')

#Smoothed Signals (Scaled)
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts (Smoothed & Scaled)"), col = 'red')
plot(fblist[[fb_matching_ind]]$time_value, sm(fb_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " FB Community Survey (Smoothed & Scaled)"), col = 'cyan')

#First Derivative (of the smoothed, scaled signals)
plot(caselist[[example_ind]]$time_value, case_first_deriv, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16,col = 'red', main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts (First Derivative & Scaled)"))
plot(fblist[[fb_matching_ind]]$time_value, fb_first_deriv, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16,col = 'cyan', main = paste0(caselist[[example_ind]]$geo_value[1], " FB Community Survey (First Derivative & Scaled)"))

Overlayed Plots Example

# Overlay the two smoothed signals (top) and their first derivatives (bottom)
par(mfrow = c(2,1))
#Overlayed Signals
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts & FB Comm. Survey (Smoothed & Scaled)"), col = 'red',
     ylim=c(0,1))
lines(fblist[[fb_matching_ind]]$time_value, sm(fb_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, col = 'cyan')

#Overlayed first derivatives (y-limits span both series)
plot(caselist[[example_ind]]$time_value, case_first_deriv, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts & FB Comm. Survey (First Derivative & Scaled)"), col = 'red',
     ylim=c(min(case_first_deriv, fb_first_deriv),max(case_first_deriv, fb_first_deriv)))
lines(fblist[[fb_matching_ind]]$time_value, fb_first_deriv, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, col = 'cyan')

# Minimum relative growth over a run for it to count as a significant increase
threshold = 0.20
par(mfrow = c(6,1))
# Criterion 1: days where the first derivative is positive and in its top
# quartile are drawn in black
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 1: First derivative > 0 and above 75th percentile"), 
     col = ifelse(case_first_deriv>quantile(case_first_deriv, 0.75) & 
                   case_first_deriv>0 , 'black', 'red'), ylim=c(0,1))

# Criterion 2: days with a negative second derivative in black
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 2: Second Derivative < 0"), col = ifelse(case_second_deriv < 0, 'black', 'red'), ylim=c(0,1))

# Criterion 3: non-negative first derivative with negative second derivative
# (i.e. near a local maximum) in black
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 3: First Derivative >= 0 and Second Derivative Negative (Local Max)"), col = ifelse( 
                   (case_first_deriv>= 0) & 
                     case_second_deriv < 0, 'black', 'red'), ylim=c(0,1))

# Criterion 3b (disabled): flat regions via the raw |first derivative| median cut
#plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, #main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 3b: Regions where signal is flat"), col = #ifelse(abs(case_first_deriv) <= quantile(abs(case_first_deriv), 0.5), 'black', 'red'), ylim=c(0,1))


# Selection criterion 3c: runs of more than 5 consecutive days where the
# |first derivative| sits in the lower half of its distribution, i.e. the
# signal is flat. (flat_regions and s are reused further below.)
flat_regions = which(abs(case_first_deriv) <= quantile(abs(case_first_deriv), 0.5))
runs <- split(flat_regions, cumsum(c(TRUE, diff(flat_regions) != 1)))
s <- unlist(Filter(function(run) length(run) > 5, runs), use.names = FALSE)
flat_cols <- ifelse(seq_along(caselist[[example_ind]]$time_value) %in% s, 'black', 'red')
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized, 10),
     type = 'b', lwd = 2, xlab = "Date", ylab = "Value", pch = 16,
     main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 3c: Regions where signal is flat"),
     col = flat_cols, ylim = c(0, 1))


# Selection criterion 4 (cases): runs where the first derivative is positive
# and in its top quartile, kept only when the smoothed signal grows by more
# than `threshold` over the run, then extended 5 days rightward to capture
# the local max
increasing_period = which(case_first_deriv > quantile(case_first_deriv, 0.75) & case_first_deriv > 0)
runs <- split(increasing_period, cumsum(c(TRUE, diff(increasing_period) != 1)))
smoothed_cases <- sm(case_signal_normalized, 10)
s <- unlist(lapply(runs, function(run) {
  if (smoothed_cases[max(run)] / smoothed_cases[min(run)] > (1 + threshold)) {
    c(run, (max(run) + 1):(max(run) + 5))
  }
}), use.names = FALSE)
#s <- unlist(lapply(s, function(x) { c(x, (max(x)+1):(max(x)+5))}), use.names = F)
plot(caselist[[example_ind]]$time_value, smoothed_cases, type = 'b', lwd = 2,
     xlab = "Date", ylab = "Value", pch = 16,
     main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 4: First Derivative > 0 and above 75th percentile and include extra 5 days to the right (to capture local max) and \n has > 20% increase in cases"),
     col = ifelse(seq_along(caselist[[example_ind]]$time_value) %in% s, 'black', 'red'), ylim = c(0, 1))


# Selection criterion 4 applied to the FB CLI-in-community signal
increasing_period = which(fb_first_deriv > quantile(fb_first_deriv, 0.75) & fb_first_deriv > 0)
runs <- split(increasing_period, cumsum(c(TRUE, diff(increasing_period) != 1)))
smoothed_fb <- sm(fb_signal_normalized, 10)
s <- unlist(lapply(runs, function(run) {
  if (smoothed_fb[max(run)] / smoothed_fb[min(run)] > (1 + threshold)) {
    c(run, (max(run) + 1):(max(run) + 5))
  }
}), use.names = FALSE)
plot(fblist[[fb_matching_ind]]$time_value, smoothed_fb, type = 'b', lwd = 2,
     xlab = "Date", ylab = "Value", pch = 16,
     col = ifelse(seq_along(fblist[[fb_matching_ind]]$time_value) %in% s, 'black', 'cyan'),
     main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 4 (FB Survey): First Derivative > 0 and above 75th percentile and include extra 5 days to the right (to capture local max) \n and has >20% increase in FB survey"), ylim = c(0, 1))

Extending Approach to all counties

Identifying periods of increases for all counties, calculating precision, recall

  1. Are significant increases in cases usually preceded by significant increases in FB? (Recall)
  2. Are significant increases in FB seldom observed where cases remain flat? (Precision)
  1. For each county, we first check if the county has any negative case counts, all 0 case counts, or no matching FB signals. If any of these criteria are true, we skip this particular county.
  2. Next, we merge the FB community signal and case signal on time/dates.
  3. Then, scale both signals to be between 0 and 1.
  4. Next, calculate the first derivative of the case and FB signal and select for regions where these derivatives are > 0 (increasing cases or FB survey). During this selection, we also check if this time period meets the threshold for a significant increase.We save this information into a list, so that it can be accessed later for plotting purposes.
  5. Then, we identify the start of the increase for the FB signal and the start of the increase for the case signal.
Recall: (num. times a case increase is preceded by an FB increase by a margin of 2 weeks or less)/(num. significant increases in cases)
Precision: 1 - (num. flat-case regions preceded by an FB increase)/(num. regions where cases remain flat)
# Re-split the signals by county and reset the global accumulators used for
# the overall recall/precision figures
caselist=split(data.table::as.data.table(case_counts), by = "geo_value")
fblist=split(data.table::as.data.table(cli), by = "geo_value")
#needs to be put in a loop, this is for individual cases

# Counters accumulated across all counties
num_significant_cases_increases = 0
num_positive_recall_cases=0
num_cases_remain_flat = 0
num_fb_increase_flat_case_regions=0
# Minimum relative growth for a "significant" increase (20%)
threshold = 0.2
# Per-county outputs, keyed by FIPS code
processed_signals_list=vector("list", length(caselist))
precision_performance_list=vector("list", length(caselist))
recall_performance_list=vector("list", length(caselist))
names(processed_signals_list)<-names(caselist)
names(precision_performance_list)<-names(caselist)
names(recall_performance_list)<-names(caselist)

# For each county: merge the case and FB CLI-in-community signals, scale and
# smooth both, locate significant increases and flat stretches, and
# accumulate the counts behind the overall recall/precision figures.
#
# Side effects: increments num_significant_cases_increases,
# num_positive_recall_cases, num_cases_remain_flat and
# num_fb_increase_flat_case_regions, and fills processed_signals_list,
# precision_performance_list and recall_performance_list (keyed by FIPS).
for(i in seq_along(caselist))
{
  # Progress indicator
  if(i %% 100 == 0)
    cat(i, " of ", length(caselist), "\n")
  # Skip counties with negative counts, constant (e.g. all-zero) counts,
  # or no matching FB signal
  if(any(caselist[[i]]$value < 0) || var(caselist[[i]]$value) == 0 ||
     !(names(caselist)[i] %in% names(fblist)))
    next
  case_elem = caselist[[i]]
  fb_elem = fblist[[which(names(fblist) %in% names(caselist)[i])]]
  # Require roughly a month of data in both signals
  if(nrow(case_elem) < 30 || nrow(fb_elem) < 30)
    next
  # Rename the FB value column so both values survive the merge
  # (assumes "value" is the 7th column of the FB table -- TODO confirm)
  colnames(fb_elem)[7] <- "FB.Value"
  merged_df = merge(x = case_elem, y = fb_elem, by = "time_value")
  
  # Scale both signals onto [0, 1], then kernel-smooth
  merged_df$value = rescale_vector(merged_df$value)
  merged_df$FB.Value = rescale_vector(merged_df$FB.Value)
  
  smoothed_county_cases = sm(merged_df$value, 10)
  smoothed_fb_survey = sm(merged_df$FB.Value, 10)
  
  tt = seq_len(nrow(merged_df))

  # Spline derivatives of the smoothed signals (the second derivatives are
  # computed for parity with the example section but are unused below)
  case_first_deriv = splinefun(tt, smoothed_county_cases)(tt, 1)
  case_second_deriv = splinefun(tt, smoothed_county_cases)(tt, 2)
  fb_first_deriv = splinefun(tt, smoothed_fb_survey)(tt, 1)
  fb_second_deriv = splinefun(tt, smoothed_fb_survey)(tt, 2)
  
  # Candidate increase days: first derivative positive and in its top
  # quartile; split into runs of consecutive days
  regions_case_increases = which(case_first_deriv > quantile(case_first_deriv, 0.75) & case_first_deriv > 0)
  increasing_case_time_period_list <- split(regions_case_increases, cumsum(c(TRUE, diff(regions_case_increases) != 1)))
  
  regions_fb_increases = which(fb_first_deriv > quantile(fb_first_deriv, 0.75) & fb_first_deriv > 0)
  increasing_fb_time_period_list <- split(regions_fb_increases, cumsum(c(TRUE, diff(regions_fb_increases) != 1)))

  if(length(regions_case_increases) == 0 || length(regions_fb_increases) == 0)
    next
  
  # Keep only runs with > threshold relative growth, extending each 5 days
  # rightward to capture the local max. The zero check (done first, so no
  # 0/0 division occurs) rules out runs starting at the signal's minimum.
  # NOTE(review): the extension can index past the end of the series; those
  # indices are only ever used in %in% tests, so they are harmless.
  increasing_case_time_period_list <- lapply(increasing_case_time_period_list, function(x){
    if(smoothed_county_cases[min(x)] != 0 &&
       smoothed_county_cases[max(x)]/smoothed_county_cases[min(x)] > (1+threshold))
    {
      c(x, (max(x)+1):(max(x)+5))
    }
  })
  increasing_case_time_period_list = increasing_case_time_period_list[lengths(increasing_case_time_period_list) != 0]
  num_significant_cases_increases = num_significant_cases_increases + length(increasing_case_time_period_list)
  
  increasing_fb_time_period_list <- lapply(increasing_fb_time_period_list, function(x){
    if(smoothed_fb_survey[min(x)] != 0 &&
       smoothed_fb_survey[max(x)]/smoothed_fb_survey[min(x)] > (1+threshold))
    {
      c(x, (max(x)+1):(max(x)+5))
    }
  })
  increasing_fb_time_period_list = increasing_fb_time_period_list[lengths(increasing_fb_time_period_list) != 0]
  
  # Recall numerator: does any FB increase begin 1-14 days before the start
  # of this case increase?
  does_fb_increase_precede_case <- lapply(increasing_case_time_period_list, function(x){
        beginning_case_increase = min(x)
        beginning_fb_increase = unlist(lapply(increasing_fb_time_period_list, min), use.names = FALSE)
        1*any((beginning_case_increase - beginning_fb_increase > 0) & (beginning_case_increase - beginning_fb_increase <= 14))
  })
  
  num_positive_recall_cases = num_positive_recall_cases + sum(unlist(does_fb_increase_precede_case, use.names = FALSE))
  
  # Flat stretches: |first derivative| in the lower half of its distribution,
  # kept when longer than 5 days.
  # BUG FIX: this previously split `flat_regions`, a stale global left over
  # from the single-county example above, so every county reused that
  # county's flat regions; it now uses the per-county flat_case_regions.
  flat_case_regions = which(abs(case_first_deriv) <= quantile(abs(case_first_deriv), 0.5))
  flat_case_time_period_list <- split(flat_case_regions, cumsum(c(TRUE, diff(flat_case_regions) != 1)))
  flat_case_time_period_list <- lapply(flat_case_time_period_list, function(x) {
    if(length(x) > 5)
    {
      x
    }
  })
  flat_case_time_period_list = flat_case_time_period_list[lengths(flat_case_time_period_list) != 0]
  num_cases_remain_flat = num_cases_remain_flat + length(flat_case_time_period_list)

  # Precision (false-positive) numerator: does any FB increase begin 1-14
  # days before the start of a flat case stretch?
  does_fb_increase_flat_case <- lapply(flat_case_time_period_list, function(x){
        beginning_case_increase = min(x)
        beginning_fb_increase = unlist(lapply(increasing_fb_time_period_list, min), use.names = FALSE)
        1*any((beginning_case_increase - beginning_fb_increase > 0) & (beginning_case_increase - beginning_fb_increase <= 14))
  })

  num_fb_increase_flat_case_regions = num_fb_increase_flat_case_regions + sum(unlist(does_fb_increase_flat_case, use.names = FALSE))
  
  # Per-county recall and precision
  recall_performance_list[[i]] <- sum(unlist(does_fb_increase_precede_case, use.names = FALSE))/length(increasing_case_time_period_list)
  precision_performance_list[[i]] <- 1 - sum(unlist(does_fb_increase_flat_case, use.names = FALSE))/length(flat_case_time_period_list)

  # Indicator columns for later plotting.
  # NOTE(review): Flat_FB_Regions duplicates FB_Increasing_Regions -- it is
  # built from increasing_fb_time_period_list and no flat-FB regions are
  # computed anywhere. Left unchanged to preserve output, but the column
  # name is misleading.
  merged_df$Case_Increasing_Regions = ifelse(1:nrow(merged_df) %in% unlist(increasing_case_time_period_list, use.names = FALSE),1,0)
  merged_df$FB_Increasing_Regions = ifelse(1:nrow(merged_df) %in% unlist(increasing_fb_time_period_list, use.names = FALSE),1,0)
  merged_df$Flat_Case_Regions = ifelse(1:nrow(merged_df) %in% unlist(flat_case_time_period_list, use.names = FALSE),1,0)
  merged_df$Flat_FB_Regions = ifelse(1:nrow(merged_df) %in% unlist(increasing_fb_time_period_list, use.names = FALSE),1,0)
  processed_signals_list[[i]] <- merged_df

}
## 100  of  3193 
## 200  of  3193 
## 300  of  3193 
## 400  of  3193 
## 500  of  3193 
## 600  of  3193 
## 700  of  3193 
## 800  of  3193 
## 900  of  3193 
## 1000  of  3193 
## 1100  of  3193 
## 1200  of  3193 
## 1300  of  3193 
## 1400  of  3193 
## 1500  of  3193 
## 1600  of  3193 
## 1700  of  3193 
## 1800  of  3193 
## 1900  of  3193 
## 2000  of  3193 
## 2100  of  3193 
## 2200  of  3193 
## 2300  of  3193 
## 2400  of  3193 
## 2500  of  3193 
## 2600  of  3193 
## 2700  of  3193 
## 2800  of  3193 
## 2900  of  3193 
## 3000  of  3193 
## 3100  of  3193

What is the overall (Recall,Precision)?

print(c(num_positive_recall_cases/num_significant_cases_increases, 1-num_fb_increase_flat_case_regions/num_cases_remain_flat))
## [1] 0.3609703 0.9014611

What is the F1 score for all counties?

# Assemble the per-county precision/recall estimates into one data frame and
# compute the F1 score. (Fix: replaced reassignable T/F literals with
# TRUE/FALSE throughout.)
precision_vector = unlist(precision_performance_list, use.names = TRUE)
recall_vector = unlist(recall_performance_list, use.names = TRUE)
# Counties that produced both a precision and a recall estimate
matching_counties = intersect(names(precision_vector), names(recall_vector))
performance_df = data.frame(County_Code = matching_counties,
                            Precision = precision_vector[match(matching_counties, names(precision_vector))],
                            Recall = recall_vector[match(matching_counties, names(recall_vector))],
           stringsAsFactors = FALSE)
# Harmonic mean of precision and recall; NaN when both are 0 (those rows
# sort to the end below, like NA)
performance_df$F1_score = 2*(performance_df$Precision * performance_df$Recall)/(performance_df$Precision + performance_df$Recall)
hist(performance_df$F1_score, xlab = "F1 Score", main = "Frequency Distribution of F1 Scores", col = "orange")

# Show the 20 best-performing counties
performance_df = performance_df[order(performance_df$F1_score, decreasing = TRUE),]
print(performance_df[1:20,])
##       County_Code Precision Recall F1_score
## 01015       01015         1      1        1
## 01045       01045         1      1        1
## 04003       04003         1      1        1
## 06029       06029         1      1        1
## 06095       06095         1      1        1
## 08001       08001         1      1        1
## 08031       08031         1      1        1
## 08043       08043         1      1        1
## 08101       08101         1      1        1
## 12023       12023         1      1        1
## 12075       12075         1      1        1
## 12083       12083         1      1        1
## 12087       12087         1      1        1
## 12089       12089         1      1        1
## 12131       12131         1      1        1
## 13029       13029         1      1        1
## 13045       13045         1      1        1
## 13223       13223         1      1        1
## 17073       17073         1      1        1
## 17179       17179         1      1        1
suppressMessages(library(patchwork))
# Render the dual-axis comparison plot for each of the 20 counties with the
# highest F1 scores
plot_list <- vector("list", 20)
for (idx in seq_len(20)) {
  county_name <- performance_df$County_Code[idx]
  plot_list[[idx]] <- plot_one(county_name, xlab = "Date",
                               ylab1 = "Daily new confirmed COVID-19 cases",
                               ylab2 = "% of people who know someone with CLI")
  print(plot_list[[idx]])
}